#' ---
#' title: "Greenwashing the Future? Computational Text Analysis of Environmental Reporting from the Fossil Fuel Industry"
#' subtitle: "01_dictionary_analysis.R"
#' author: "Robin Rauner"
#' date: "Note: Code compiled successfully on `r format(Sys.time(), '%d %B %Y')`"
#' ---

# load packages
library(quanteda)  # CRAN v4.1.0
library(tidyverse) # CRAN v2.0.0

sessionInfo()

###### IMPORT & PREPARE DATA ######

# Text corpus, read as UTF-8.
# A leading row-index column named `X` (what `read.csv()` typically produces
# for a file written by `write.csv()` with its default row names) is dropped
# when present. `any_of()` keeps the import from failing if the file was
# written without row names and therefore has no `X` column.
dat_messages <- read.csv("messages_text_corpus.csv", fileEncoding = "utf-8") |>
  select(-any_of("X"))

# quick structural check: dimensions and column names
dim(dat_messages)
names(dat_messages)

# Dictionary terms, read as UTF-8; the file must provide a `term` column
fls_terms <- read.csv("fls_terms_no_glob.csv", fileEncoding = "utf-8")
dim(fls_terms)

# print all terms as a single comma-separated string
fls_terms |>
  summarise(Terms = paste(term, collapse = ", "))


###### DICTIONARY CLASSIFICATION ######

# build a quanteda dictionary with a single key ("fls") from the term list
dict_fls <- dictionary(list(fls = fls_terms$term))

# turn the message data frame into a quanteda corpus
corp_messages <- corpus(dat_messages)

# tokenize, then lowercase:
# numbers and punctuation are removed, hyphenated words are kept whole, and
# removed tokens leave an empty pad behind (padding = TRUE) so the positions
# of the remaining tokens are preserved
toks <- tokens_tolower(
  tokens(
    corp_messages,
    remove_numbers = TRUE,
    remove_punct = TRUE,
    split_hyphens = FALSE,
    padding = TRUE
  )
)

# look up the fls dictionary in the tokens and build a document-feature matrix
dfmat <- dfm(tokens_lookup(toks, dictionary = dict_fls))

# boolean weighting: every non-zero count is recoded as 1
dfmat_weighted <- dfm_weight(dfmat, scheme = "boolean")

# convert the weighted dfm to a data frame and append its document variables
dat_dict <- cbind(
  quanteda::convert(dfmat_weighted, to = "data.frame"),
  docvars(dfmat_weighted)
)

# join the original message data back on by document id;
# `company` then exists on both sides (suffixes .x / .y), so the left-hand
# copy gets its plain name back and the right-hand duplicate is dropped
dat_dict_text <- left_join(dat_dict, dat_messages, by = "doc_id") |>
  rename(company = company.x) |>
  select(-company.y)

# Save the annotated data.
# The inputs of this script are read as UTF-8, so the output is written as
# UTF-8 explicitly rather than in the platform's native encoding (which is not
# UTF-8 on every system); this keeps non-ASCII text intact across machines.
# Row names are left at the `write.csv()` default, so the file layout is
# unchanged and still starts with a row-index column.
# NOTE(review): consider `row.names = FALSE` once it is confirmed that no
# reader of this file expects that index column.
write.csv(
  dat_dict_text,
  "dictionary_annotated_data.csv",
  fileEncoding = "UTF-8"
)